import pandas as pd
import numpy as np
import plotly.express as px
PFAS
Ejemplo PFAS
Primero importamos las librerías necesarias.
# Load the PFAS table from the relative data directory, then preview its
# first rows. The two statements were fused on one line, which is a syntax
# error; they are separated here.
pfas_data = pd.read_csv("../data/pfas_data.csv")
pfas_data.head()
pfas_data.info()
pfas_data["RDKIT_SMILES"].isna().mean()<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6071 entries, 0 to 6070
Columns: 2091 entries, RDKIT_SMILES to PubchemFP880
dtypes: float64(1176), int64(914), object(1)
memory usage: 96.9+ MB
np.float64(0.0)
molecules = pfas_data["RDKIT_SMILES"].tolist()
molecules[:5]['FC(F)Cl',
'FC(F)=C(F)F',
'FC(F)(Cl)C(F)(Cl)Cl',
'C=C(F)F',
'OC(C(F)(F)F)C(F)(F)F']
!pip install rdkitRequirement already satisfied: rdkit in /opt/venv/lib/python3.13/site-packages (2025.3.5)
Requirement already satisfied: numpy in /opt/venv/lib/python3.13/site-packages (from rdkit) (2.3.2)
Requirement already satisfied: Pillow in /opt/venv/lib/python3.13/site-packages (from rdkit) (11.3.0)
from rdkit import Chem
from rdkit.Chem import AllChemmols = [Chem.MolFromSmiles(smiles) for smiles in molecules]mols[:5][<rdkit.Chem.rdchem.Mol at 0xffff701bdaf0>,
<rdkit.Chem.rdchem.Mol at 0xffff701bdc40>,
<rdkit.Chem.rdchem.Mol at 0xffff701bdcb0>,
<rdkit.Chem.rdchem.Mol at 0xffff701bdd20>,
<rdkit.Chem.rdchem.Mol at 0xffff701bdd90>]
fps = [AllChem.GetMACCSKeysFingerprint(mol) for mol in mols]fps[:5][<rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0xffff703312a0>,
<rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0xffff70331620>,
<rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0xffff70331690>,
<rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0xffff70331700>,
<rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0xffff70331770>]
fps_array = np.array(fps)
fps_arrayarray([[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
...,
[0, 0, 0, ..., 1, 0, 0],
[0, 0, 0, ..., 1, 0, 0],
[0, 0, 0, ..., 1, 0, 0]], shape=(6071, 167))
from sklearn.manifold import TSNE

# Embed the fingerprint matrix into two t-SNE components. The import and the
# three statements were fused on one line (a syntax error); they are
# separated here. random_state pins the seed passed to TSNE.
tsne = TSNE(n_components=2, perplexity=50, random_state=42)
fps_tsne = tsne.fit_transform(fps_array)

# One row per molecule, indexed by its SMILES string so the class table can
# be merged on that key afterwards.
tsne_df = pd.DataFrame(
    data=fps_tsne,
    columns=["Component_1", "Component_2"],
    index=pfas_data["RDKIT_SMILES"],
)
tsne_df.head()| Component_1 | Component_2 | |
|---|---|---|
| RDKIT_SMILES | ||
| FC(F)Cl | -56.922298 | 60.720055 |
| FC(F)=C(F)F | -69.412216 | 26.313509 |
| FC(F)(Cl)C(F)(Cl)Cl | -32.573349 | 29.395290 |
| C=C(F)F | -69.444672 | 26.623600 |
| OC(C(F)(F)F)C(F)(F)F | -22.384430 | -27.337769 |
# Read the class table, using the SMILES column as its index. This statement
# and the merge below were fused on one line (a syntax error); they are
# separated here.
pfas_classes = pd.read_csv("../data/pfas_classes.csv", index_col="RDKIT_SMILES")

# Inner merge on the shared "RDKIT_SMILES" index level: only molecules that
# appear in both the embedding and the class table are kept.
tsne_df_joined = pd.merge(tsne_df, pfas_classes, on="RDKIT_SMILES", how="inner")
tsne_df_joined.head()| Component_1 | Component_2 | First_Class | Second_Class | |
|---|---|---|---|---|
| RDKIT_SMILES | ||||
| FC(F)Cl | -56.922298 | 60.720055 | PFAS derivatives | PFAS halogen derivatives |
| FC(F)=C(F)F | -69.412216 | 26.313509 | PFAS derivatives | With fluorinated C=C or C=O carbon |
| FC(F)(Cl)C(F)(Cl)Cl | -32.573349 | 29.395290 | PFAS derivatives | PFAS halogen derivatives |
| C=C(F)F | -69.444672 | 26.623600 | PFAS derivatives | With fluorinated C=C or C=O carbon |
| OC(C(F)(F)F)C(F)(F)F | -22.384430 | -27.337769 | Other aliphatics | Others |
# 2-D scatter of the t-SNE embedding, coloured by first-level class.
# Hovering shows the row index as the title and both components with two
# decimals, plus the class.
fig = px.scatter(
    tsne_df_joined,
    x="Component_1",
    y="Component_2",
    color="First_Class",
    hover_name=tsne_df_joined.index,
    hover_data=dict(Component_1=":.2f", Component_2=":.2f", First_Class=True),
)
# Title, axis labels and a fixed 900x600 canvas.
fig.update_layout(
    title="Moléculas de PFAS mediante t-SNE",
    xaxis_title="Componente t-SNE 1",
    yaxis_title="Componente t-SNE 2",
    height=600,
    width=900,
)
fig.show();
Ahora vamos a hacerlo en $\mathbb{R}^3$.
# Repeat the embedding with three t-SNE components, using the same
# perplexity and seed as the 2-D run. The statements were fused across lines
# (a syntax error); they are separated here.
tsne = TSNE(n_components=3, perplexity=50, random_state=42)
fps_tsne = tsne.fit_transform(fps_array)

# One row per molecule, indexed by its SMILES string.
tsne_df = pd.DataFrame(
    data=fps_tsne,
    columns=["Component_1", "Component_2", "Component_3"],
    index=pfas_data["RDKIT_SMILES"],
)

# Inner merge with the class table on the shared "RDKIT_SMILES" index level.
tsne_df_joined = pd.merge(tsne_df, pfas_classes, on="RDKIT_SMILES", how="inner")
tsne_df_joined.head()| Component_1 | Component_2 | Component_3 | First_Class | Second_Class | |
|---|---|---|---|---|---|
| RDKIT_SMILES | |||||
| FC(F)Cl | -15.876364 | 24.557077 | 7.744306 | PFAS derivatives | PFAS halogen derivatives |
| FC(F)=C(F)F | -24.740488 | 3.906794 | -2.053560 | PFAS derivatives | With fluorinated C=C or C=O carbon |
| FC(F)(Cl)C(F)(Cl)Cl | -9.861135 | 1.608409 | 8.375186 | PFAS derivatives | PFAS halogen derivatives |
| C=C(F)F | -24.910532 | 3.875210 | -1.773332 | PFAS derivatives | With fluorinated C=C or C=O carbon |
| OC(C(F)(F)F)C(F)(F)F | -8.248621 | -9.961960 | -0.368669 | Other aliphatics | Others |
# 3-D scatter of the t-SNE embedding, coloured by first-level class.
# Hovering shows the row index as the title and all three components with
# two decimals, plus the class.
fig = px.scatter_3d(
    tsne_df_joined,
    x="Component_1",
    y="Component_2",
    z="Component_3",
    color="First_Class",
    hover_name=tsne_df_joined.index,
    hover_data=dict(
        Component_1=":.2f",
        Component_2=":.2f",
        Component_3=":.2f",
        First_Class=True,
    ),
)
# In a 3-D figure the axis titles are set on the scene object.
fig.update_layout(
    scene={
        "xaxis_title": "Componente t-SNE 1",
        "yaxis_title": "Componente t-SNE 2",
        "zaxis_title": "Componente t-SNE 3",
    }
)
fig.show()